#!/bin/sh
#
# Generate a C implementation of wcwidth, with latest unicode data
# from a local clone of https://github.com/jquast/wcwidth
#
# The MIT License (MIT)
#
# Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Run every tool below in the C locale, independent of the user's locale.
LC_ALL=C
export LC_ALL

# This script's name without its directory part (used in messages).
self="${0##*/}"

# C types used by the generated code. Bigger types work but waste memory,
# and the uintN_t types need <stdint.h>.
u32='uint32_t'  # "unsigned" is also typically 32 bits
u16='uint16_t'  # "unsigned short" is also typically 16 bits

# Attribute emitted between "int" and "wcwidth" in the generated function.
# Delete this line if not generating a busybox function.
FUNC_ATTR='FAST_FUNC'


# err MESSAGE...: print "<script name>: MESSAGE" to stderr and exit with
# status 1. The arguments are joined with spaces into one line.
err() {
	printf '%s\n' "$self: $*" >&2
	exit 1
}

# Print the usage text to stdout.
usage() {
	printf '%s\n' \
		"Usage: $self [path/to/python-wcwidth]   (default path is '.')" \
		"Prints a wcwidth C implementation, with latest Unicode data" \
		"imported from a local https://github.com/jquast/wcwidth repo." \
		"Assumptions about table_zero.py and table_wide.py at the repo:" \
		"- Each range is in one Unicode plane (a>>16 == b>>16) (enforced)." \
		"- Commit 04d6d90c (2023-10-30) or later, where table_zero.py" \
		"  includes zero-width Cf chars (else need to add manual tests)."
}

# -h/--help: show the usage and stop successfully. Without the exit, the
# option would fall through and be taken as the repo path.
case ${1-} in
-h | --help)
	usage
	exit 0
	;;
esac

# A leading "--" argument is dropped.
if [ "${1-}" = -- ]; then
	shift
fi

# Root of the local python-wcwidth clone: the first argument, or the
# current directory when it is missing or empty.
pwc_root="${1:-.}"
# pwc_git ARGS...: run git with ARGS inside the directory $pwc_root.
pwc_git() {
	git -C "$pwc_root" "$@"
}

# The two python tables which are the source of the generated data.
zerowidth_py="$pwc_root/wcwidth/table_zero.py"
widewidth_py="$pwc_root/wcwidth/table_wide.py"

# Both tables must be readable, else abort.
if ! [ -r "$zerowidth_py" ] || ! [ -r "$widewidth_py" ]; then
	err "missing $zerowidth_py or $widewidth_py. abort."
fi

# stdin -> stdout: the latest unicode version named in table_wide.py, taken
# from the last table header line (e.g. 10.0.0 from "    '10.0.0': (").
# Prints nothing if there is no such line.
# [[:space:]] is used rather than \s, which POSIX grep does not define.
latest_version() {
	grep "^[[:space:]]*'[0-9]" | tail -n1 | sed "s/.*'\(.*\)'.*/\1/"
}

ver=$(latest_version < "$widewidth_py")

# stdin -> stdout: extract the data of the last table (latest spec) from
# wcwidth/table_{wide,zero}.py (from https://github.com/jquast/wcwidth)
# A table begins at a line like "    '10.0.0': (", and its data are the
# following lines which begin with "(0x". The lines are printed unmodified.
# The patterns use [ \t] rather than \s, which is an extension that some awk
# implementations (e.g. mawk) take as a literal "s".
last_table() {
	awk "/^[ \t]*'[0-9]/ { i=0 }             # new table -> reset
	     /^[ \t]*\(0x/   { arr[++i] = \$0 }  # range (first, last)
	     END             { for (j=1; j <= i; ++j) print arr[j] }"
}

# stdin -> stdout, $1 is the range's (wc)width (0 or 2), e.g.
#   from: (0x0123a, 0x0123c,), # comment
#   to  : R(0x00123a, 0x00123c, 2),  /* comment */
# ranges bigger than half-plane (32769+ codepoints) are split to two.
# A range which spans two planes is an error. The loop runs in the pipeline's
# subshell, so on shells where err can only exit that subshell, the failure
# is reported to the caller as a non-zero return status.
py_data_to_c() {
	# "(", ")" and "," become spaces, and "# text" becomes "/* text */",
	# which leaves the fields: first last [comment]
	sed -e 's/[(),]/ /g' -e 's|#\(.*\)|/*\1 */|' |
	while read -r first last comment; do  # -r: keep backslashes as-is
		# to support cross-plane ranges, we'd need to split them here,
		# but unlikely required, as all planes end in non-characters.
		[ "$(($first >> 16))" = "$(($last >> 16))" ] ||
			err "not same plane -- $first $last"

		# some shells want decimal vars in $(())
		first=$(($first)) last=$(($last))
		if [ "$((last - first))" -ge 32768 ]; then  # split to 15 bit ranges
			printf 'R(0x%06x, 0x%06x, %s),  %s\n' \
				"$first" "$((first + 32767))" "$1" "$comment"
			first=$((first + 32768)) comment="/* (continued...) */"
		fi
		# the width is an argument rather than part of the format string
		printf 'R(0x%06x, 0x%06x, %s),  %s\n' \
			"$first" "$last" "$1" "$comment"
	done
}

# Convert the latest zero-width table (width 0) and wide table (width 2) to
# R(...) lines; abort if the conversion fails.
if ! data=$(
	last_table < "$zerowidth_py" | py_data_to_c 0 &&
	last_table < "$widewidth_py" | py_data_to_c 2
); then
	err abort
fi

# The values are fixed-width hex, so a lexicographic sort is also numeric.
data=$(printf '%s\n' "$data" | sort)

# Print the content of $data and a newline, i.e. the sorted hex ranges
# and their (wc)width:  R(first, last, {0|2}),[  /* ... */]
data() {
	printf '%s\n' "$data"
}

# repeat STRING COUNT: print STRING to stdout COUNT times, with nothing
# between the copies or after them. Prints nothing if COUNT is 0 or less.
# Uses the variable R as its countdown.
repeat() {
	R=$2
	while [ 0 -lt "$R" ]; do
		printf '%s' "$1"
		R=$((R - 1))
	done
}

# data -> stdout: array such that a[p], a[p+1] are [from, to) of plane p data
# Reads the sorted "R(first, last, width),..." lines on stdin and prints 18
# comma-separated indices on one line, without a trailing newline.
mkplanes() {
	prev_plane=-1
	nranges=0  # ranges read so far, which is also the index of the next one
	while read first last rest; do
		# "last" still has its trailing comma; the plane is (last >> 16)
		plane=$((${last%?} >> 16))
		# each plane after prev_plane, up to this one, begins here
		repeat "$nranges, " "$((plane - prev_plane))"
		prev_plane=$plane
		nranges=$((nranges + 1))
	done
	# the remaining entries, up to index 17, hold the total number of ranges
	repeat "$nranges, " "$((17 - prev_plane))"
}

# stdin -> stdout: prefix each line with two tabs, then trim trailing
# whitespace (so an empty line stays empty).
# The tabs are inserted literally and the trim uses [[:space:]], because \t
# and \s in a sed script are extensions which POSIX sed does not define.
indent() {
	sed -e "s/^/$(printf '\t\t')/" -e 's/[[:space:]]*$//'
}

# Print the generated C source to stdout. The here-document is unquoted, so
# the variables and command substitutions in it are expanded by the shell.
# - The struct fields use $u16 like the "planes" array, so that changing the
#   configured type (e.g. to avoid <stdint.h>) affects all of the static data.
# - The date uses an explicit format, as "date -I" is not specified by POSIX.
cat << CFUNCTION
/* wcwidth - Unicode $ver, generated by $0.
 * Copyright (C) 2024 Avi Halachmi <avihpit at yahoo.com>
 * License: MIT
 *
 * Data imported on $(date -u +%Y-%m-%d) from https://github.com/jquast/wcwidth
 * commit $(pwc_git describe --tags) ($(pwc_git show --no-patch --format=%ci))
 */
int ${FUNC_ATTR-} wcwidth($u32 ucs)
{
	/* sorted ranges, "first" is clipped to 16 bit, and its high bits
	 * (plane) are deduced from the "planes" array below.
	 * (imported from ${zerowidth_py##*/} and ${widewidth_py##*/})
	 */
	static const struct range {
		$u16 first;
		$u16 iswide: 1;  /* bitfield order empirically faster */
		$u16 difflast: 15;
	} ranges[] = {
	#define R(first, last, width) {first & 0xffff, width/2, last-first}
$(data | indent)
	#undef  R
	};

	/* planes[p], planes[p+1] are [from, to) at "ranges" for plane p */
	static const $u16 planes[/* 18 */] = {
$(data | mkplanes | fold -s -w 60 | indent)
	};

	/******* END OF STATIC DATA *******/

	$u32 p, bot, top;

	/* 0:0, 1..31:-1 (C0), 32..126:1 (isprint), 127..159:-1 (DEL, C1) */
	if (ucs < 160)
		return ((ucs + 1) & 127) > 32 ? 1 : ucs ? -1 : 0;

	/* out of range for "planes" (and non-unicode), non-characters. */
	/* (some also test surrogate halves, but not required by POSIX) */
	if (ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
		return -1;

	p = ucs >> 16;
	ucs &= 0xffff;

	for (bot = planes[p], top = planes[p+1]; bot < top; ) {
		$u32 mid = (bot + top) / 2;
		if (ucs < ranges[mid].first)
			top = mid;
		else if (ucs > ranges[mid].first + ranges[mid].difflast)
			bot = mid + 1;
		else
			return 2 * ranges[mid].iswide;
	}

	return 1;
}  /* wcwidth - Unicode $ver */
CFUNCTION
